#######################################################################################
# Replication materials for 
#
# Richard Traunm�ller, Andreas Murr &  Jeff Gill 
# "Modeling Latent Information in Voting Data with Dirichlet Process Priors"
# Political Analysis
#
# Version:  October 7th 2014
#
# Requires: GlesWuest.dta (dataset)
#           RecodeGles2009LikeWuest2011.do (create replication data set, optional)
#           JeffGlmdmCode.R (glmdm model code)
# 					Cval4by10Fold.RData (cross-validation workspace)
#						4by10CrossValidation.R (cross-validation exercise code, optional)
#           SavedResults.RData (saved results and posterior simulations workspace)
#
#######################################################################################

#######################################################################################
# load r-packages

	source("JeffGlmdmCode.R")
	require(foreign)
	require(Hmisc)
	require(ROCR)
	require(arm)
	require(xtable)

#######################################################################################
# write functions for display of results

glm.tab <- function(fit){                 # Function to create GLM summary table 
  round(cbind(coef(fit), coef(fit)-1.96*se.coef(fit), coef(fit)+1.96*se.coef(fit)), 2)
}

mcmc.table <- function(fit){              # Function to create GLMDM summary table
  cbind(round(apply(fit$beta.sims, 2, mean),3), round(apply(fit$beta.sims, 2, sd),3), round(t(apply(fit$beta.sims[,], 2, quantile, c(.025, .975))),3))
}

CoefPlot <- function(fit, title){          # Function to plot GLMDM estimates
  par(mgp=c(2, 0.5, 0), tck=-0.005)
  post.mean <- apply(fit$beta.sims, 2, mean) 
  plot(post.mean, (length(post.mean):1)-.2, pch=17, main=title, axes=F, cex=.7, ylab="", xlab="Posterior Means & 95% HPDs", xlim=c(-2.5, 2.5), cex.lab=.7, cex.main=.7)
  abline(h=length(post.mean):1, col="grey92")
  abline(v=seq(-3, 2, by=1), col="grey92")
  axis(1, tck=-.005, cex.lab=.7, cex.axis=.7)
  points(post.mean, (length(post.mean):1)-.2, pch=17, cex=.7)
  segments(t(apply(fit$beta.sims[,], 2, quantile, .025)), (length(post.mean):1)-.2, t(apply(fit$beta.sims[,], 2, quantile, .975)), (length(post.mean):1)-.2) 
  abline(v=0, lty=2)
  #box()
} 

AddPlot <- function(fit){                 # Function to add GLM estimates to plot
  points(coef(fit), (length(coef(fit)):1)+.2, pch=21, cex=.7)
  segments(coef(fit)-1.96*se.coef(fit), (length(coef(fit)):1)+.2, coef(fit)+1.96*se.coef(fit), (length(coef(fit)):1)+.2, col="grey48")
  points(coef(fit), (length(coef(fit)):1)+.2, pch=19, cex=.5, col="white")
}

plotRocAvgPdf <- function(p1,p2,a){       # Function for plotting average ROCs
  pred1 <- prediction(p1,a)
  pred2 <- prediction(p2,a)
  perf1 <- performance(pred1, "tpr", "fpr")
  perf2 <- performance(pred2, "tpr", "fpr")
  plot(perf1,avg="threshold",lty=2,col="grey48",add=TRUE)
  plot(perf2,avg="threshold",add=TRUE)
}

#######################################################################################
# load data

data <- read.dta("GlesWuest.dta", convert.factors=F)

# OR: replicate this data set by downloading the original dataset ("ZA5302_en_v6-0-0.dta") from 
# the GESIS website: www.gesis.org/wahlen/gles/ 
# and recoding it with "RecodeGles2009LikeWuest2011.do"
# this do file saves the data set as "GlesWuestReplication.dta" 
# which you can load in R with
# data <- read.dta("GlesWuestReplication.dta", convert.factors=F)



#######################################################################################
# generate new variables

data$cdu <- ifelse(data$secondvote==1, 1, 0)
data$left <- ifelse(data$secondvote==2 | data$secondvote==4 | data$secondvote==5, 1, 0)
data$partisanship <- ifelse(data$pid==1, 1, 0)

#######################################################################################
# standardize (quasi-)metric variables (cf. Gelman & Hill 2007)

data$age <- (data$age - mean(data$age, na.rm = TRUE)) / (2*sd(data$age, na.rm = TRUE))
data$class <- (data$class - mean(data$class, na.rm = TRUE)) / (2*sd(data$class, na.rm = TRUE))
data$interest <- (data$interest - mean(data$interest, na.rm = TRUE)) / (2*sd(data$interest, na.rm = TRUE))
data$strengthpid <- (data$strengthpid - mean(data$strengthpid, na.rm = TRUE)) / (2*sd(data$strengthpid, na.rm = TRUE))
data$civicduty <- (data$civicduty - mean(data$civicduty, na.rm = TRUE)) / (2*sd(data$civicduty, na.rm = TRUE))
data$absdiffeval <- (data$absdiffeval - mean(data$absdiffeval, na.rm = TRUE)) / (2*sd(data$absdiffeval, na.rm = TRUE))
data$satisf <- (data$satisf - mean(data$satisf, na.rm = TRUE)) / (2*sd(data$satisf, na.rm = TRUE))
data$attendance <- (data$attendance - mean(data$attendance, na.rm = TRUE)) / (2*sd(data$attendance, na.rm = TRUE))
data$leftright <- (data$leftright - mean(data$leftright, na.rm = TRUE)) / (2*sd(data$leftright, na.rm = TRUE))
data$spending <- (data$spending - mean(data$spending, na.rm = TRUE)) / (2*sd(data$spending, na.rm = TRUE))
data$immigration <- (data$immigration - mean(data$immigration, na.rm = TRUE)) / (2*sd(data$immigration, na.rm = TRUE))
data$nuclear <- (data$nuclear - mean(data$nuclear, na.rm = TRUE)) / (2*sd(data$nuclear, na.rm = TRUE))

#######################################################################################
# select post-election wave and respondents with migration background
# one data set for turnout and vote choice, respectively

dataTur <- na.omit(subset(data[data$survey==2 & data$migrbckgr==1,], select = c("turnout", "firstgeneration", "aussiedler", "guestworker", "otherlanguage", "other", "female", "age", "educ", "employ", "member", "class", "interest", "strengthpid", "civicduty", "absdiffeval", "satisf")))
dataCdu <- na.omit(subset(data[data$survey==2 & data$migrbckgr==1,], select = c("cdu", "firstgeneration", "aussiedler", "guestworker", "otherlanguage", "other", "female", "age", "educ", "employ", "member", "attendance", "worker", "selfempl", "leftright", "spending", "immigration", "nuclear", "partisanship")))

#######################################################################################
# ============
# = figure 1 =
# ============	
# Simulated draws from a Dirichlet Process with base measure G_0 = N(0,1) and three different precision parameters lambda.

	par(mfrow=c(1,3), mar=c(4,1,4,1), mgp=c(2, .5, 0))

	stick.sim <- function(m, title){
		n <- 100
		a <- m
		stick <- 1
		w <- rbeta(1, 1, a) * stick
		for(i in 2:n){  
		  stick <- stick - w[i-1]
		  w[i] <- rbeta(1, 1, a) * stick 
		}
		g <- rmultinom(1, length(w), w)
		B <- g>0 
		B[B] <- 1
		B <- B*rnorm(n)
		plot(table(rep(B[1:n], g[1:n])), axes=F, ylim=c(0, 20), xlim=c(-3, 3), xlab=expression(psi), ylab="", main=title, cex.main=1)
		axis(1, tck=-.005, cex.axis=.8)
		x <- seq(-3, 3, length=n)
		points(x, dnorm(x)*20, type="l", col="grey30", lty=2)
		text(0, dnorm(0)*20, expression(G[0]), col="grey30", pos=3)
		text(0, dnorm(0)*20, expression(G), pos=1)
	}

  pdf("dirichlet_sim.pdf", width=6, height=5)  
	stick.sim(5, expression(paste(lambda, " = 5")))
	stick.sim(50, expression(paste(lambda, " = 50")))
	stick.sim(150, expression(paste(lambda, " = 150")))
  dev.off()



########################################################################################
# ============
# = figure 2 =
# ============
# ROC curves of out-of-sample predictive performance of GLMDM probit and GLM probit models. Results from a 4-fold cross-validation with 10 iterations.

# load workspace

  load("Cval4by10Fold.RData")

# OR: replicate the cross-validation exercise with
# source("4by10CrossValidation.R")
# this file saves the workspace in the format: paste("Cval4by10Fold", format(Sys.time(), "%d%b%Y"), ".RData", sep = "")

# create plots and save as pdf	

	pdf("roc_cval_turnout_bw.pdf", width=6, height=5)	
	par(mfrow=c(1,1), mar = c(4, 4, 2, 4), mgp = c(2.2, .5, 0), tck = -0.005)
	plot(c(0:1), c(0:1), pch = "", cex.main = .9, main = "Turnout: Comparison of Model Fit (Out of Sample)", ylab = "Average True Positive Rate", xlab = "Average False Positive Rate", las = 1)
	abline(h = seq(0, 1, by = .2), col = "grey92")
	abline(v = seq(0, 1, by = .2), col = "grey92")
	abline(a = 0, b = 1)
	p1 <- CvalTur$phats.glm
	p2 <- CvalTur$phats.glmdm
	a <- CvalTur$actuals
	plotRocAvgPdf(p1,p2,a)
	text(.1, .95, "GLMDM",  pos=4, cex=.7)
	text(.2, .55, "GLM", col="grey48", pos=4, cex=.7)	
	dev.off()

	pdf("roc_cval_votechoicewithpid_bw.pdf", width=6, height=5)	
	par(mfrow=c(1,1), mar = c(4, 4, 2, 4), mgp = c(2.2, .5, 0), tck = -0.005)
	plot(c(0:1), c(0:1), pch = "", cex.main = .9, main = "Vote Choice: Comparison of Model Fit (Out of Sample)", ylab = "Average True Positive Rate", xlab = "Average False Positive Rate", las = 1)
	abline(h = seq(0, 1, by = .2), col = "grey92")
	abline(v = seq(0, 1, by = .2), col = "grey92")
	abline(a = 0, b = 1)
	p1 <- CvalPid$phats.glm
	p2 <- CvalPid$phats.glmdm
	a <- CvalPid$actuals
	plotRocAvgPdf(p1,p2,a)
	text(.1, .95, "GLMDM",  pos=4, cex=.7)
	text(.2, .75, "GLM", col="grey48", pos=4, cex=.7)	
	dev.off()

#######################################################################################
# ============
# = figure 3 =
# ============  
# Comparison of the size of the fixed effect coefficients' standard errors from GLMDM probit and GLM probit models.

# estimate models
# turnout 

	formula.tur <- "turnout ~ firstgeneration + aussiedler + guestworker + otherlanguage + other + female + age + educ + employ + member + class + interest + strengthpid + civicduty + absdiffeval + satisf"
	tur.glm <- glm(formula.tur, data=dataTur, family=binomial(link="probit"), x=TRUE)  # GLM
	tur.glmdm <- glmdm(formula.tur, data=dataTur, family=binomial(link="probit"), num.reps=5000)  # GLMDM

# vote choice 

	formula.cdu <- "cdu ~ firstgeneration + aussiedler + guestworker + otherlanguage + other + female + age + educ + employ + member + attendance + worker + selfempl + leftright + spending + immigration + nuclear + partisanship"
  cdu.glm <- glm(formula.cdu, data=dataCdu, family=binomial(link="probit"), x=TRUE)  # GLM
	cdu.glmdm <- glmdm(formula.cdu, data=dataCdu, family=binomial(link="probit"), num.reps=5000)  # GLMDM

# OR: load in saved GLM estimates and GLMDM posterior simulations 
# to reproduce EXACT results without simulation error
# load("SavedResults.RData")

# plot standard errors

	pdf("se_turnout_bw.pdf", width=6, height=5)
	par(mar=c(4,4,2,4), mgp=c(2.2, .5, 0), tck=-0.005)
	plot(se.coef(tur.glm), apply(tur.glmdm$beta.sims, 2, sd), pch=21, xlim=c(0, 1.1), ylim=c(0,1.1), cex.main=.9, main="Turnout: Comparison of SEs", xlab="SEs of GLM", ylab="SEs of GLMDM", las=1)
	abline(h=seq(0, 1, by=.2), col="grey92")
	abline(v=seq(0, 1, by=.2), col="grey92")
	points(se.coef(tur.glm), apply(tur.glmdm$beta.sims, 2, sd), pch=21)
	abline(a=0,b=1)
	dev.off()
	
	pdf("se_votechoice_bw.pdf", width=6, height=5)
	par(mar=c(4,4,2,4), mgp=c(2.2, .5, 0), tck=-0.005)
	plot(se.coef(cdu.glm), apply(cdu.glmdm$beta.sims, 2, sd), pch=21, xlim=c(0, 1.1), ylim=c(0,1.1), cex.main=.9, main="Vote Choice: Comparison of SEs", xlab="SEs of GLM", ylab="SEs of GLMDM", las=1)
	abline(h=seq(0, 1, by=.2), col="grey92")
	abline(v=seq(0, 1, by=.2), col="grey92")
	points(se.coef(cdu.glm), apply(cdu.glmdm$beta.sims, 2, sd), pch=21)
	abline(a=0,b=1)
	dev.off()

#######################################################################################
# ============
# = figure 4 =
# ============
# Posterior means and 95% highest probability densities (HPD) of GLMDM probit and GLM probit models for immigrants' turnout in the 2009 German federal election.

	pdf("coef_turnout_bw.pdf", width=6, height=5)
	par(mar=c(3,8,2,2), tck=-.005)
	CoefPlot(tur.glmdm, "Immigrant Turnout")
	axis(2, at=(length(coef(tur.glm)):1), labels=names(coef(tur.glm)), las=1, tck=-.005, cex.axis=.7)
	text(1.8, 16.2, "GLM", cex=.7, col="grey48", pos=4)
	text(1.8, 15.6, "GLMDM", cex=.7, pos=4)
	points(1.8, 16.2, pch=21, cex=.7)
	points(1.8, 15.6, pch=17, cex=.7)
	AddPlot(tur.glm)
	box()
	dev.off()

#######################################################################################
# ===========
# = table 1 =
# ===========
# Posterior means and 95% highest probability densities (HPD) of GLM and GLMDM probit models for immigrants' turnout in the 2009 German federal election.

	res.tab.1 <- round(cbind(glm.tab(tur.glm), mcmc.table(tur.glmdm)[,c(1,3,4)]), 2)
	xtable(res.tab.1, type="latex", caption=c("Results for turnout"))

#######################################################################################
# ============
# = figure 5 =
# ============
# Posterior means and 95% highest probability densities (HPD) of GLMDM probit and GLM probit models for immigrants' vote choice in the 2009 German federal election.

	pdf("coef_votechoice_bw.pdf", width=6, height=5)
	par(mar=c(3,8,2,2))
	CoefPlot(cdu.glmdm, "Immigrant Vote Choice CDU/CSU")
	axis(2, at=(length(coef(cdu.glm)):1), labels=names(coef(cdu.glm)), las=1, cex.axis=.7)
	text(1.8, 17.2, "GLM", cex=.7, col="grey48", pos=4)
	text(1.8, 16.6, "GLMDM", cex=.7, pos=4)
	points(1.8, 17.2, pch=21, cex=.7)
	points(1.8, 16.6, pch=17, cex=.7)
	AddPlot(cdu.glm)
	box()
	dev.off()

#######################################################################################
# ===========
# = table 2 =
# ===========
# Posterior means and 95% highest probability densities (HPD) of the GLM and GLMDM probit models for immigrants' vote choice in the 2009 German federal election.

	res.tab.2 <- round(cbind(glm.tab(cdu.glm), mcmc.table(cdu.glmdm)[,c(1,3,4)]), 2)
	xtable(res.tab.2, type="latex", caption=c("Results for vote choice"))

#######################################################################################
# ===================
# = end source code =
# ===================